In [1]:
from __future__ import print_function, unicode_literals

import gzip
import json
import os
import re

In [2]:
catalog_dir = os.path.join('..','catalogs')

In [3]:
files = sorted(filter(lambda x: bool(re.search(r'json\.gz$', x)), os.listdir(catalog_dir)))  # sort so the numeric indices used below are stable

In [4]:
files


Out[4]:
[u'ecoinvent_3.2_apos_xlsx.json.gz',
 u'ecoinvent_3.2_consequential_xlsx.json.gz',
 u'ecoinvent_3.2_cut-off_xlsx.json.gz',
 u'ecoinvent_3.2_undefined_xlsx.json.gz',
 u'ei3.2_undefined_spold.json.gz',
 u'ei_lcia.json.gz',
 u'elcd_3.2.json.gz',
 u'elcd_lcia.json.gz',
 u'gabi_2016_all-extensions.json.gz',
 u'gabi_2016_professional-database-2016.json.gz',
 u'uslci_ecospold.json.gz']

Select databases of interest


In [5]:
my_files = [os.path.join(catalog_dir, files[k]) for k in (3, 6, 9, 8, 10)]  # we love list comprehensions!

In [6]:
my_files


Out[6]:
[u'../catalogs/ecoinvent_3.2_undefined_xlsx.json.gz',
 u'../catalogs/elcd_3.2.json.gz',
 u'../catalogs/gabi_2016_professional-database-2016.json.gz',
 u'../catalogs/gabi_2016_all-extensions.json.gz',
 u'../catalogs/uslci_ecospold.json.gz']

Give the databases short names for reference


In [7]:
names = ['EI (u)', 'ELCD', 'GaBi-Pro', 'GaBi-Ext', 'US LCI']

In [8]:
def load_archive(filename):
    """Read a gzipped JSON catalog file and return the deserialized archive dict."""
    with gzip.open(filename, 'r') as fp:
        J = json.load(fp)

    return J
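
(A side note, not part of the original notebook: the __future__ imports above show this was written for Python 2. Under Python 3 one might prefer to open the archive in text mode so json.load always receives str; a minimal sketch, with a hypothetical name load_archive_py3:)

def load_archive_py3(filename):
    # 'rt' decodes the gzip stream to text, so json.load receives str on any Python 3 version
    with gzip.open(filename, 'rt', encoding='utf-8') as fp:
        return json.load(fp)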

Load all the archives into a collection called 'C'


In [9]:
C = [load_archive(fname) for fname in my_files]

(should take about 2-5 seconds)

Create geography grid (Table 4 in manuscript)


In [10]:
geog = []
for i, archive in enumerate(C):
    for p in archive['processes']:
        geog.append({'db': names[i], 'process': p['Name'], 'geog': p['SpatialScope']})

There should be one entry in geog for each process listed in a database - a total of around 25,000 processes


In [11]:
len(geog)


Out[11]:
25287
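
(A quick sanity check one might add here - not in the original notebook - counting the entries per database to confirm every archive contributed:)

from collections import Counter
Counter(entry['db'] for entry in geog)  # maps each short name in names to its process count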

Use pandas to build the pivot tables


In [12]:
import pandas as pd

In [13]:
P = pd.DataFrame(geog).pivot_table(index='geog', columns='db',  aggfunc=len, fill_value='', margins=True)

In [14]:
P.sort_values(by=('process','All'), ascending=False)[:20]  # only show the top 20 rows


Out[14]:
process
db EI (u) ELCD GaBi-Ext GaBi-Pro US LCI All
geog
All 13307 503 7457 3319 701 25287
GLO 6218 25 446 338 15 7042
DE 168 19 2131 314 2632
US 92 1179 137 16 1424
RNA 13 649 19 667 1348
CH 1260 10 44 33 1347
RER 1136 75 14 84 3 1312
EU-27 96 296 869 1261
CA-QC 346 346
IN 60 187 55 302
IT 73 11 149 52 285
BR 66 153 59 278
NL 76 10 117 65 268
CN 65 99 99 263
FR 94 10 106 50 260
GB 70 10 97 62 239
AT 73 10 94 47 224
ES 75 10 75 34 194
SE 77 10 60 46 193
BE 67 10 72 44 193

Count Reference Flow Frequency (Table 3 in manuscript)


In [15]:
from collections import Counter

In [16]:
def create_flow_map(archive):
    """
    Create a hash map (dict) from each flow's entityId to the flow entity itself - allows very fast lookup
    """
    flow_map = dict()
    for f in archive['flows']:
        flow_map[f['entityId']] = f
    return flow_map

In [17]:
def count_ref_flows(archive):
    rfs = Counter()
    flow_map = create_flow_map(archive)
    for i in archive['processes']:
        x = [v for v in i['exchanges'] if 'isReference' in v and v['isReference'] is True]
        if len(x) == 0:
            count_key = (None, None)
            rfs[count_key] += 1
        else:
            for xc in x:
                direc = xc['direction']
                flowref = xc['flow']
                try:
                    flowname = flow_map[flowref]['Name']
                except KeyError:
                    # some archives store the flow reference as a string; retry with an integer key
                    try:
                        flowname = flow_map[int(flowref)]['Name']
                    except (KeyError, ValueError):
                        flowname = 'Flow Not Found!'
                count_key = (direc, flowname)
                rfs[count_key] += 1
    return rfs
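
(An illustrative way to inspect a single archive's counter, not in the original notebook - the keys are (direction, flow name) tuples:)

count_ref_flows(C[0]).most_common(5)  # five most frequent reference flows in the first archive, EI (u)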

In [18]:
rf_count = []
for i, archive in enumerate(C):
    print('Parsing archive %s' % names[i])
    rfs = count_ref_flows(archive)
    for rf, count in rfs.items():
        try:
            rf_count.append({'db': names[i], 'exchange': '%s: %s' % (rf[0], rf[1]), 'count': count})
        except TypeError:
            print('rf: %s (type %s) count: %d' %(rf, type(rf), count))


Parsing archive EI (u)
Parsing archive ELCD
Parsing archive GaBi-Pro
Parsing archive GaBi-Ext
Parsing archive US LCI

(should take << 1 second)


In [19]:
RF = pd.DataFrame(rf_count).pivot_table(index='exchange', columns='db', aggfunc=sum, fill_value='', margins=True)

In [20]:
RF.sort_values(('count','All'), ascending=False)[:20]


Out[20]:
count
db EI (u) ELCD GaBi-Ext GaBi-Pro US LCI All
exchange
All 14158 503 7457 3319 701 26138
Output: electricity, high voltage 2350 2350
Output: Thermal energy (MJ) 944 236 1180
Output: Electricity 64 522 471 1057
Output: Steam (MJ) 340 622 962
Output: electricity, low voltage 730 730
Input: Housing technology 340 191 531
Output: electricity, medium voltage 423 423
Output: heat, district or industrial, other than natural gas 402 402
Output: Cargo 80 127 207
Output: heat, district or industrial, natural gas 141 141
Output: heat, central or small-scale, other than natural gas 139 139
None: None 3 107 10 1 121
Input: Aluminium scrap 60 59 119
Input: Incineration good 53 58 111
Input: Steel scrap 55 54 109
Output: blast furnace gas 108 108
Output: biogas 97 97
Output: Elevator utilization 90 6 96
Output: Used cooling (Emission air conditioner) 86 4 90

Text Co-occurrence (Table 5 in manuscript)


In [21]:
from collections import defaultdict  #, Counter  # already imported

def tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/'):
    """
    tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/')

    Parse the specified fields to generate a list of tags, delimited as specified
    
    entity: a JSON serialized entity
    look_in: a set of fields to extract tags from
    delimiter: regexp for re.split()  Default specifies: 
       semicolon with trailing space OR
       comma with trailing space OR
       open parens with leading space OR
       close parens with trailing space OR
       slash
    
    """
    tags = set()
    for k, v in entity.items():
        if v is None: continue
        if k in look_in:
            try:
                tags = tags.union('='.join([k,f]) for f in filter(bool, re.split(delimiter, v)))
            except TypeError:
                tags = tags.union('='.join([k,f]) for f in filter(bool, re.split(delimiter, ', '.join(v))))
                        
    return tags

def count_tags(e_list, search=None, include=None, exclude=None):
    """
    count_tags(e_list, search=None, include=None, exclude=None)
    Extract tags from entity list.
    Optional search term: only counts entities where the search term is found
    
    Default fields: 'Name', 'Comment', 'SpatialScope', 'Classifications'
    add additional fields with include=; remove fields with exclude=
    
    Returns d, m
    d = a Counter object containing tags with their counts
    m = a dictionary: keys are tags, values are lists of entity IDs bearing the tag
    """
    look_in = {'Name', 'Comment', 'SpatialScope', 'Classifications'}
    if include is not None:
        look_in = look_in.union(set(include))
    if exclude is not None:
        look_in = look_in.difference(set(exclude))
    d = Counter()
    m = defaultdict(list)
    for e in e_list:
        t = tags(e, look_in)
        if search is not None:
            if not any([bool(re.search(search, k, flags=re.IGNORECASE)) for k in t]):
                continue
        for i in t:
            d[i] += 1
            m[i].append(e['entityId'])
    return d, dict(m)
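
To make the tag-splitting concrete, here is an illustrative call (not from the original notebook) on a made-up entity dict:

example = {'Name': 'transport, freight, lorry 16-32 metric ton, EURO5', 'SpatialScope': 'RER'}
tags(example, {'Name', 'SpatialScope'})
# -> {'Name=transport', 'Name=freight', 'Name=lorry 16-32 metric ton', 'Name=EURO5', 'SpatialScope=RER'}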

In [22]:
# Search each catalog for processes containing the term 'EURO.?[0-9]' and print their most common tags
for i, archive in enumerate(C):
    print('\n%s:' % names[i])
    d0, m0 = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    print([k for k in d0.most_common() if k[1] > 10])


EI (u):
[(u'SpatialScope=GLO', 190), (u'Name=transport', 173), (u'IsicClass=Freight transport by road', 160), (u'Name=freight', 160), (u'Name=passenger car', 95), (u'IsicClass=Other passenger land transport', 95), (u'TechnologyLevel=Current', 91), (u'Name=market for transport', 82), (u'TechnologyLevel=Undefined', 82), (u'SpatialScope=RER', 65), (u'Name=lorry with refrigeration machine', 64), (u'Comment=internal combustion engine', 61), (u'Name=EURO3', 40), (u'Name=EURO5', 40), (u'Name=EURO4', 40), (u'Name=R134a refrigerant', 40), (u'Name=EURO 5', 33), (u'Name=3.5-7.5 ton', 32), (u'Name=EURO6', 32), (u'Name=carbon dioxide', 32), (u'Name=7.5-16 ton', 32), (u'Name=EURO 3', 31), (u'Name=EURO 4', 31), (u'TechnologyLevel=Modern', 30), (u'Name=medium size', 29), (u'TechnologyLevel=New', 28), (u'Name=large size', 27), (u'Name=natural gas', 27), (u'Name=small size', 27), (u'Name=diesel', 27), (u'Name=petrol', 26), (u'TechnologyLevel=Old', 24), (u'Name=cooling', 24), (u'Name=freezing', 24), (u'Comment=linking activity', 24), (u'Name=lorry 7.5-16 metric ton', 18), (u'Name=lorry >32 metric ton', 18), (u'Name=lorry 3.5-7.5 metric ton', 18), (u'Name=lorry 16-32 metric ton', 18), (u'Name=small lorry with refrigeration machine', 16), (u'Name=...', 16), (u'Name=liquid refrigerant', 16), (u'Comment=ConvertedDataset', 12)]

ELCD:
[]

GaBi-Pro:
[(u'Classifications=Processes', 81), (u'Classifications=Truck', 79), (u'Comment=driving share motorway', 76), (u'Classifications=Transport', 76), (u'SpatialScope=GLO', 76), (u'Comment=rural', 76), (u'Comment=utilisation [-]', 76), (u'Comment=payload [t]', 76), (u'Classifications=Road', 76), (u'Comment=status January 2010\n- input parameter: distance [km]', 76), (u'Comment=driving share: HBEFA 3.1', 76), (u'Comment=urban\n- average sulphur content: EU = 10 ppm', 68), (u'Comment=sulphur content diesel [ppm]', 58), (u'Name=Truck', 50), (u'Name=Truck-trailer', 20), (u'Comment=- source emissions', 12), (u'Comment=- average emission values Euro 3\n- source emissions', 12), (u'Comment=- average emission values Euro 6 SCR', 12), (u'Comment=- average emission values Euro 1\n- source emissions', 12), (u'Comment=- average emission values Euro 4\n- source emissions', 12), (u'Comment=- average emission values Euro 2\n- source emissions', 12), (u'Comment=non tampered', 12), (u'Comment=- average emission values Euro 5 SCR\n- source emissions', 11)]

GaBi-Ext:
[(u'Classifications=Processes', 52), (u'Classifications=Truck', 48), (u'Comment=driving share motorway', 42), (u'Classifications=Transport', 42), (u'Comment=rural', 42), (u'Comment=utilisation [-]', 42), (u'Comment=payload [t]', 42), (u'Classifications=Road', 42), (u'Comment=sulphur content diesel [ppm]', 40), (u'Comment=04', 36), (u'SpatialScope=IN', 36), (u'Comment=CSIR - Central Road Research Institute New Dehli-25', 36), (u'Comment=2014\n- input parameter: distance [km]', 36), (u'Comment=status January 2010\n- Driving share: Fuel efficicency standards of heavy duty vehicles in India', 36), (u'Comment=urban\n- average sulphur content: IN = 350 ppm', 35), (u'Name=Truck', 29), (u'Comment=- average emission values Bharat stage III considered as equal to Euro 3\n- source emissions:  HBEFA 3.1', 12), (u'Comment=- average emission values Bharat stage IV considered as equal to Euro 4\n- source emissions:  HBEFA 3.1', 12), (u'Comment=- average emission values Bharat stage II considered as equal to Euro 2\n- source emissions:  HBEFA 3.1', 12)]

US LCI:
[]

After that, it's just some formatting to get the tag counts into tabular form
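
That formatting code isn't reproduced here; as a rough sketch (reusing the count_tags call from the previous cell - the manuscript's exact layout may differ), one might collect the frequent tags into a DataFrame and pivot it the same way as the earlier tables:

cooccur = []
for i, archive in enumerate(C):
    d0, _ = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    for tag, count in d0.most_common():
        if count > 10:
            cooccur.append({'db': names[i], 'tag': tag, 'count': count})

T = pd.DataFrame(cooccur).pivot_table(index='tag', columns='db', aggfunc=sum, fill_value='')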